import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.figure_factory as ff
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# Load the EDA dataset and preview its first ten rows.
raw_df = pd.read_csv(filepath_or_buffer="../datasets/EDA_df.csv")
raw_df.head(n=10)
| customer_id | gender | senior_citizen | partner | dependents | phone_service | multiple_lines | online_security | online_backup | device_protection | tech_support | streaming_tv | streaming_movies | paperless_billing | internet_service_DSL | internet_service_fiber_optic | no_internet_service | contract_month_to_month | contract_one_year | contract_two_year | payment_method_bank_transfer_automatic | payment_method_credit_card_automatic | payment_method_electronic_check | payment_method_mailed_check | tenure | monthly_charges | total_charges | churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1 | 29.85 | 29.85 | 0 |
| 1 | 5575-GNVDE | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 34 | 56.95 | 1936.30 | 0 |
| 2 | 3668-QPYBK | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 2 | 53.85 | 107.70 | 1 |
| 3 | 7795-CFOCW | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 45 | 42.30 | 1903.50 | 0 |
| 4 | 9237-HQITU | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 2 | 70.70 | 141.40 | 1 |
| 5 | 9305-CDSKC | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 1 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 8 | 99.65 | 797.20 | 1 |
| 6 | 1452-KIOVK | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 22 | 89.10 | 1960.20 | 0 |
| 7 | 6713-OKOMC | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 10 | 29.75 | 297.50 | 0 |
| 8 | 7892-POOKP | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 28 | 104.80 | 2934.40 | 1 |
| 9 | 6388-TABGU | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 62 | 56.15 | 3481.30 | 0 |
# Two-column feature frame used for the first (2-D) clustering pass.
X_cluster_2_features = raw_df.loc[:, ['tenure', 'monthly_charges']]

# Tenure vs. monthly charges; colour is taken from raw_df['churn'] and
# marker size from raw_df['total_charges'].
fig = px.scatter(
    X_cluster_2_features,
    x="tenure",
    y="monthly_charges",
    color=raw_df['churn'],
    size=raw_df['total_charges'],
)

# Centred title plus human-readable axis labels.
layout_options = {
    'title': 'Tenure and Monthly Charges',
    'title_x': 0.5,
    'xaxis_title': 'Tenure',
    'yaxis_title': 'Monthly Charges',
}
fig.update_layout(**layout_options)
fig.show()
Jumlah centroid yang dipilih adalah 4 karena hasil clustering-nya dapat dijelaskan melalui visualisasi.
# K-Means with 4 clusters on the two selected features.
kmeans_4_centroid = KMeans(n_clusters=4, random_state=42)

# Fit on the two feature columns explicitly. The labels are written back into
# the same frame, so fitting on the whole frame would silently use a previous
# run's `cluster_pred` column as a third feature when this cell is re-executed.
X_cluster_2_features['cluster_pred'] = kmeans_4_centroid.fit_predict(
    X_cluster_2_features[['tenure', 'monthly_charges']]
)

# NOTE: this is an alias of X_cluster_2_features, not a copy.
cluster_pred_4_centroid = X_cluster_2_features
cluster_pred_4_centroid.head(10)
| tenure | monthly_charges | cluster_pred | |
|---|---|---|---|
| 0 | 1 | 29.85 | 3 |
| 1 | 34 | 56.95 | 0 |
| 2 | 2 | 53.85 | 3 |
| 3 | 45 | 42.30 | 0 |
| 4 | 2 | 70.70 | 2 |
| 5 | 8 | 99.65 | 2 |
| 6 | 22 | 89.10 | 2 |
| 7 | 10 | 29.75 | 3 |
| 8 | 28 | 104.80 | 2 |
| 9 | 62 | 56.15 | 0 |
# Same tenure / monthly-charges scatter, now coloured by the K-Means label;
# marker size still comes from raw_df['total_charges'].
fig = px.scatter(
    data_frame=cluster_pred_4_centroid,
    x="tenure",
    y="monthly_charges",
    color="cluster_pred",
    size=raw_df['total_charges'],
)
fig.show()
Menggunakan 27 fitur (seluruh kolom selain customer_id, termasuk churn).
# Elbow method: record the K-Means inertia (within-cluster sum of squares)
# for k = 1..10 on columns 1..27 of raw_df (everything after customer_id).
elbow_features = raw_df.iloc[:, 1:28]

wcss = []
for i in range(1, 11):
    kmeans = KMeans(i, random_state=42).fit(elbow_features)
    wcss.append(kmeans.inertia_)

# Line plot of inertia against the number of clusters.
fig = go.Figure(data=go.Scatter(x=np.arange(1, 11), y=wcss))
elbow_layout = {
    'title': 'Within-Cluster Sum of Squares',
    'title_x': 0.5,
    'xaxis_title': 'Number of Clusters',
    'yaxis_title': 'Score',
}
fig.update_layout(**elbow_layout)
fig.show()
# K-Means with 2 clusters on columns 1..27 of raw_df; the labels are stored
# as a new `cluster_pred` column on raw_df itself.
kmeans_2_centroid = KMeans(n_clusters=2, random_state=42)
two_cluster_labels = kmeans_2_centroid.fit_predict(raw_df.iloc[:, 1:28])
raw_df['cluster_pred'] = two_cluster_labels

# NOTE: alias of raw_df, not a copy.
cluster_pred_df = raw_df

# 2-D view: tenure vs. monthly charges, sized by total charges.
fig = px.scatter(
    cluster_pred_df,
    x="tenure",
    y="monthly_charges",
    color="cluster_pred",
    size='total_charges',
)
fig.show()

# 3-D view with total charges on the z axis.
fig = px.scatter_3d(
    cluster_pred_df,
    x='tenure',
    y='monthly_charges',
    z='total_charges',
    color='cluster_pred',
)
fig.show()
# Load the dataset variant stored under `without-oversampling/` and report
# its (rows, columns) shape.
without_oversampling_df = pd.read_csv(
    filepath_or_buffer='../datasets/without-oversampling/EDA_df.csv'
)
without_oversampling_df.shape
(7043, 28)
# Model selection for the Gaussian mixture on the non-oversampled data:
# fit 1..20 components and record AIC and BIC, each evaluated on the same
# columns (1..27) the model was fitted on.
gmm_features = without_oversampling_df.iloc[:, 1:28]

aic_scores = []
bic_scores = []
for i in range(1, 21):
    gm = GaussianMixture(n_components=i, random_state=42).fit(gmm_features)
    aic_scores.append(gm.aic(gmm_features))
    bic_scores.append(gm.bic(gmm_features))

# Plot both criteria against the number of components.
fig = go.Figure()
for criterion_name, criterion_scores in (('AIC', aic_scores), ('BIC', bic_scores)):
    fig.add_trace(
        go.Scatter(
            x=np.arange(1, 21),
            y=criterion_scores,
            mode='lines+markers',
            name=criterion_name,
        )
    )
selection_layout = {
    'title': 'Cluster Selection with AIC and BIC',
    'title_x': 0.5,
    'xaxis_title': 'Number of Clusters',
    'yaxis_title': 'Score',
}
fig.update_layout(**selection_layout)
fig.show()
# Feature frame for the non-oversampled data (columns 1..27).
# Take an explicit copy: a column is added below, and assigning into an
# `iloc` slice of another DataFrame is chained assignment, which raises
# SettingWithCopyWarning and leaves it ambiguous which frame is modified.
X_without_oversampling = without_oversampling_df.iloc[:, 1:28].copy()

# Hard component assignment from a 2-component Gaussian mixture.
gm_pred = GaussianMixture(n_components=2, random_state=42).fit_predict(X_without_oversampling)

# Attach the labels after fitting so they are never used as a feature.
X_without_oversampling['gm_pred'] = gm_pred
X_without_oversampling.head(10)
| gender | senior_citizen | partner | dependents | phone_service | multiple_lines | online_security | online_backup | device_protection | tech_support | streaming_tv | streaming_movies | paperless_billing | internet_service_DSL | internet_service_fiber_optic | no_internet_service | contract_month_to_month | contract_one_year | contract_two_year | payment_method_bank_transfer_automatic | payment_method_credit_card_automatic | payment_method_electronic_check | payment_method_mailed_check | tenure | monthly_charges | total_charges | churn | gm_pred | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1 | 29.85 | 29.85 | 0 | 1 |
| 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 34 | 56.95 | 1936.30 | 0 | 1 |
| 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 2 | 53.85 | 107.70 | 1 | 1 |
| 3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 45 | 42.30 | 1903.50 | 0 | 1 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 2 | 70.70 | 141.40 | 1 | 1 |
| 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 1 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 8 | 99.65 | 797.20 | 1 | 1 |
| 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 22 | 89.10 | 1960.20 | 0 | 1 |
| 7 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 10 | 29.75 | 297.50 | 0 | 1 |
| 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 28 | 104.80 | 2934.40 | 1 | 1 |
| 9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 62 | 56.15 | 3481.30 | 0 | 1 |
# 2-D view of the mixture labels: tenure vs. monthly charges, sized by
# total charges.
fig = px.scatter(
    X_without_oversampling,
    x="tenure",
    y="monthly_charges",
    color="gm_pred",
    size='total_charges',
)
fig.show()

# 3-D view with total charges on the z axis.
fig = px.scatter_3d(
    X_without_oversampling,
    x='tenure',
    y='monthly_charges',
    z='total_charges',
    color='gm_pred',
)
fig.show()

# Rows counted as "anomalies" here: mixture component 0 with monthly charges
# inside the closed interval [40, 60].
charges_in_band = (
    (X_without_oversampling['monthly_charges'] >= 40)
    & (X_without_oversampling['monthly_charges'] <= 60)
)
in_component_zero = X_without_oversampling['gm_pred'] == 0
anomaly_rows = X_without_oversampling[charges_in_band & in_component_zero]
print(f"Number of anomaly datapoint: {anomaly_rows.shape[0]}")
Number of anomaly datapoint: 207
# Same 3-D scatter as for the mixture labels, but coloured by the `churn`
# column so the two colourings can be compared.
fig = px.scatter_3d(
    data_frame=X_without_oversampling,
    x='tenure',
    y='monthly_charges',
    z='total_charges',
    color='churn',
)
fig.show()
# Load the dataset variant stored under `oversampling/` and report its
# (rows, columns) shape.
oversampling_df = pd.read_csv(
    filepath_or_buffer='../datasets/oversampling/EDA_telco_customer_churn.csv'
)
oversampling_df.shape
(10348, 28)
# Feature frame for the oversampled data (columns 1..27). Copied explicitly
# because a later cell adds a `gm_pred` column to this frame, and writing
# into an `iloc` slice of another DataFrame raises SettingWithCopyWarning.
X_oversampling = oversampling_df.iloc[:, 1:28].copy()

# Model selection for the Gaussian mixture on the oversampled data: fit
# 1..20 components and record AIC and BIC.
aic_scores = []
bic_scores = []
for i in range(1, 21):
    gm = GaussianMixture(n_components=i, random_state=42).fit(X_oversampling)
    # Score on the SAME data the model was fitted on. Scoring on
    # `without_oversampling_df` (as before) evaluates the likelihood on a
    # different dataset, so the curve would not describe this fit.
    aic_scores.append(gm.aic(X_oversampling))
    bic_scores.append(gm.bic(X_oversampling))

# Plot both criteria against the number of components.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=np.arange(1, 21),
        y=aic_scores,
        mode='lines+markers',
        name='AIC',
    )
)
fig.add_trace(
    go.Scatter(
        x=np.arange(1, 21),
        y=bic_scores,
        mode='lines+markers',
        name='BIC',
    )
)
fig.update_layout(
    title='Cluster Selection with AIC and BIC',
    title_x=0.5,
    xaxis_title='Number of Clusters',
    yaxis_title='Score',
)
fig.show()
# Fit on the feature columns only. If this cell is re-run, X_oversampling
# already carries a `gm_pred` column from the previous run; dropping it first
# keeps old labels from being fed back into the mixture as a feature.
gm_features = X_oversampling.drop(columns='gm_pred', errors='ignore')

# Hard component assignment from a 2-component Gaussian mixture.
gm_pred = GaussianMixture(n_components=2, random_state=42).fit_predict(gm_features)

# `assign` returns a new frame with `gm_pred` appended as the last column,
# avoiding an in-place write into what may be a slice of oversampling_df
# (which would raise SettingWithCopyWarning).
X_oversampling = gm_features.assign(gm_pred=gm_pred)
X_oversampling.head(10)
| gender | senior_citizen | partner | dependents | phone_service | multiple_lines | online_security | online_backup | device_protection | tech_support | streaming_tv | streaming_movies | paperless_billing | internet_service_DSL | internet_service_fiber_optic | no_internet_service | contract_month_to_month | contract_one_year | contract_two_year | payment_method_bank_transfer_automatic | payment_method_credit_card_automatic | payment_method_electronic_check | payment_method_mailed_check | tenure | monthly_charges | total_charges | churn | gm_pred | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1 | 29.85 | 29.85 | 0 | 0 |
| 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 34 | 56.95 | 1936.30 | 0 | 1 |
| 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 2 | 53.85 | 107.70 | 1 | 1 |
| 3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 45 | 42.30 | 1903.50 | 0 | 1 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 2 | 70.70 | 141.40 | 1 | 1 |
| 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 1 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 8 | 99.65 | 797.20 | 1 | 1 |
| 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 22 | 89.10 | 1960.20 | 0 | 1 |
| 7 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 10 | 29.75 | 297.50 | 0 | 1 |
| 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 28 | 104.80 | 2934.40 | 1 | 1 |
| 9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 62 | 56.15 | 3481.30 | 0 | 1 |
# 2-D view of the mixture labels on the oversampled data: tenure vs. monthly
# charges, sized by total charges.
fig = px.scatter(
    X_oversampling,
    x="tenure",
    y="monthly_charges",
    color="gm_pred",
    size='total_charges',
)
fig.show()

# 3-D view with total charges on the z axis.
fig = px.scatter_3d(
    X_oversampling,
    x='tenure',
    y='monthly_charges',
    z='total_charges',
    color='gm_pred',
)
fig.show()

# Rows counted as "anomalies" here: mixture component 0 with monthly charges
# inside the closed interval [40, 60].
charges_in_band = (
    (X_oversampling['monthly_charges'] >= 40)
    & (X_oversampling['monthly_charges'] <= 60)
)
in_component_zero = X_oversampling['gm_pred'] == 0
anomaly_rows = X_oversampling[charges_in_band & in_component_zero]
print(f"Number of anomaly datapoint: {anomaly_rows.shape[0]}")
Number of anomaly datapoint: 426
Jumlah anomali yang terdeteksi bertambah ketika menggunakan dataset oversampling. Anomali terjadi pada data point dengan monthly charges di rentang 40–60 dolar.
Untuk memastikan anomali tersebut, kami memeriksa distribusi tenure, monthly charges, dan total charges menggunakan box plot dan distribution plot.
# For each numeric column of the oversampled data, show a box plot followed
# by a distribution plot (histogram + KDE + rug) to inspect spread and
# outliers. Columns are processed in the order tenure, monthly_charges,
# total_charges; the label is the legend entry of the distribution plot.
columns_and_labels = (
    ('tenure', 'tenure'),
    ('monthly_charges', 'monthly charges'),
    ('total_charges', 'total charges'),
)
for column_name, legend_label in columns_and_labels:
    fig = px.box(oversampling_df, x=column_name)
    fig.show()

    hist_data = [oversampling_df[column_name]]
    group_labels = [legend_label]
    fig = ff.create_distplot(hist_data, group_labels)
    fig.show()